install library
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages('plotly')
#install.packages("lubridate")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Read data
theUrl_train <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/train.csv"
Train <- data.frame(read.csv(file = theUrl_train, header = TRUE, sep = ","),split='train')
theUrl_test <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/test.csv"
Test <- data.frame(read.csv(file = theUrl_test, header = TRUE, sep = ","),split='test')
theUrl_store <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/store.csv"
Store <- read.csv(file = theUrl_store, header = TRUE, sep = ",")
check data
summary(Train)
## Store DayOfWeek Date Sales
## Min. : 1.0 Min. :1.000 2013-01-02: 1115 Min. : 0
## 1st Qu.: 280.0 1st Qu.:2.000 2013-01-03: 1115 1st Qu.: 3727
## Median : 558.0 Median :4.000 2013-01-04: 1115 Median : 5744
## Mean : 558.4 Mean :3.998 2013-01-05: 1115 Mean : 5774
## 3rd Qu.: 838.0 3rd Qu.:6.000 2013-01-06: 1115 3rd Qu.: 7856
## Max. :1115.0 Max. :7.000 2013-01-07: 1115 Max. :41551
## (Other) :1010519
## Customers Open Promo StateHoliday
## Min. : 0.0 Min. :0.0000 Min. :0.0000 0:986159
## 1st Qu.: 405.0 1st Qu.:1.0000 1st Qu.:0.0000 a: 20260
## Median : 609.0 Median :1.0000 Median :0.0000 b: 6690
## Mean : 633.1 Mean :0.8301 Mean :0.3815 c: 4100
## 3rd Qu.: 837.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7388.0 Max. :1.0000 Max. :1.0000
##
## SchoolHoliday split
## Min. :0.0000 train:1017209
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1786
## 3rd Qu.:0.0000
## Max. :1.0000
##
sum(is.na(Train$Store))
## [1] 0
summary(Store)
## Store StoreType Assortment CompetitionDistance
## Min. : 1.0 a:602 a:593 Min. : 20.0
## 1st Qu.: 279.5 b: 17 b: 9 1st Qu.: 717.5
## Median : 558.0 c:148 c:513 Median : 2325.0
## Mean : 558.0 d:348 Mean : 5404.9
## 3rd Qu.: 836.5 3rd Qu.: 6882.5
## Max. :1115.0 Max. :75860.0
## NA's :3
## CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2
## Min. : 1.000 Min. :1900 Min. :0.0000
## 1st Qu.: 4.000 1st Qu.:2006 1st Qu.:0.0000
## Median : 8.000 Median :2010 Median :1.0000
## Mean : 7.225 Mean :2009 Mean :0.5121
## 3rd Qu.:10.000 3rd Qu.:2013 3rd Qu.:1.0000
## Max. :12.000 Max. :2015 Max. :1.0000
## NA's :354 NA's :354
## Promo2SinceWeek Promo2SinceYear PromoInterval
## Min. : 1.0 Min. :2009 :544
## 1st Qu.:13.0 1st Qu.:2011 Feb,May,Aug,Nov :130
## Median :22.0 Median :2012 Jan,Apr,Jul,Oct :335
## Mean :23.6 Mean :2012 Mar,Jun,Sept,Dec:106
## 3rd Qu.:37.0 3rd Qu.:2013
## Max. :50.0 Max. :2015
## NA's :544 NA's :544
sum(is.na(Store$Store))
## [1] 0
#Check store id is unique
nrow(distinct(Store))==nrow(Store)
## [1] TRUE
#Join train data and store
Train_data <- left_join(Train, Store, by="Store")
summary(Train_data)
## Store DayOfWeek Date Sales
## Min. : 1.0 Min. :1.000 2013-01-02: 1115 Min. : 0
## 1st Qu.: 280.0 1st Qu.:2.000 2013-01-03: 1115 1st Qu.: 3727
## Median : 558.0 Median :4.000 2013-01-04: 1115 Median : 5744
## Mean : 558.4 Mean :3.998 2013-01-05: 1115 Mean : 5774
## 3rd Qu.: 838.0 3rd Qu.:6.000 2013-01-06: 1115 3rd Qu.: 7856
## Max. :1115.0 Max. :7.000 2013-01-07: 1115 Max. :41551
## (Other) :1010519
## Customers Open Promo StateHoliday
## Min. : 0.0 Min. :0.0000 Min. :0.0000 0:986159
## 1st Qu.: 405.0 1st Qu.:1.0000 1st Qu.:0.0000 a: 20260
## Median : 609.0 Median :1.0000 Median :0.0000 b: 6690
## Mean : 633.1 Mean :0.8301 Mean :0.3815 c: 4100
## 3rd Qu.: 837.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7388.0 Max. :1.0000 Max. :1.0000
##
## SchoolHoliday split StoreType Assortment
## Min. :0.0000 train:1017209 a:551627 a:537445
## 1st Qu.:0.0000 b: 15830 b: 8294
## Median :0.0000 c:136840 c:471470
## Mean :0.1786 d:312912
## 3rd Qu.:0.0000
## Max. :1.0000
##
## CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear
## Min. : 20 Min. : 1.0 Min. :1900
## 1st Qu.: 710 1st Qu.: 4.0 1st Qu.:2006
## Median : 2330 Median : 8.0 Median :2010
## Mean : 5430 Mean : 7.2 Mean :2009
## 3rd Qu.: 6890 3rd Qu.:10.0 3rd Qu.:2013
## Max. :75860 Max. :12.0 Max. :2015
## NA's :2642 NA's :323348 NA's :323348
## Promo2 Promo2SinceWeek Promo2SinceYear
## Min. :0.0000 Min. : 1.0 Min. :2009
## 1st Qu.:0.0000 1st Qu.:13.0 1st Qu.:2011
## Median :1.0000 Median :22.0 Median :2012
## Mean :0.5006 Mean :23.3 Mean :2012
## 3rd Qu.:1.0000 3rd Qu.:37.0 3rd Qu.:2013
## Max. :1.0000 Max. :50.0 Max. :2015
## NA's :508031 NA's :508031
## PromoInterval
## :508031
## Feb,May,Aug,Nov :118596
## Jan,Apr,Jul,Oct :293122
## Mar,Jun,Sept,Dec: 97460
##
##
##
summary(is.na(Train_data))
## Store DayOfWeek Date Sales
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1017209 FALSE:1017209 FALSE:1017209 FALSE:1017209
##
## Customers Open Promo StateHoliday
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1017209 FALSE:1017209 FALSE:1017209 FALSE:1017209
##
## SchoolHoliday split StoreType Assortment
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1017209 FALSE:1017209 FALSE:1017209 FALSE:1017209
##
## CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear
## Mode :logical Mode :logical Mode :logical
## FALSE:1014567 FALSE:693861 FALSE:693861
## TRUE :2642 TRUE :323348 TRUE :323348
## Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1017209 FALSE:509178 FALSE:509178 FALSE:1017209
## TRUE :508031 TRUE :508031
#Check when Promo2SinceWeek&Year are simultaneously NA and the Promo2 is exactly 0.
checkPromo2 <- Train_data %>%
filter(is.na(Train_data$Promo2SinceWeek)==TRUE & is.na(Train_data$Promo2SinceYear)==TRUE)
nrow(checkPromo2)==508031
## [1] TRUE
sum(checkPromo2$Promo2)
## [1] 0
#Check when CompetitionOpenWeek&Year are simultaneously NA
CompetitionOpen <- Train_data %>%
filter(is.na(Train_data$CompetitionOpenSinceMonth)==TRUE & is.na(Train_data$CompetitionOpenSinceYear)==TRUE)
nrow(CompetitionOpen)==323348
## [1] TRUE
Date format
Train_data <- Train_data %>%
mutate(date=as.Date(Train_data$Date, format="%Y-%m-%d"))%>%
mutate(yearMonth=format(as.Date(date),"%Y-%m"))%>%
mutate(month=month(Date))
average sales and customer without any criteria
#sales distribution
ggplot(data=Train_data,mapping = aes(x=Train_data$Sales))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#customer distribution
ggplot(data=Train_data,mapping = aes(x=Train_data$Customers))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#relationship between sales and customers
ggplot(data = Train_data,mapping = aes(x=Train_data$Customers,y=Train_data$Sales))+
geom_point()
#store average
storeAvg <- Train_data%>%
filter(Open==1)%>%
group_by(Store)%>%
summarize(mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))
summary(storeAvg)
## Store mean(Sales) sd(Sales) mean(Customers)
## Min. : 1.0 Min. : 2704 Min. : 576 Min. : 240.2
## 1st Qu.: 279.5 1st Qu.: 5322 1st Qu.:1403 1st Qu.: 541.5
## Median : 558.0 Median : 6590 Median :1773 Median : 678.7
## Mean : 558.0 Mean : 6934 Mean :1846 Mean : 754.5
## 3rd Qu.: 836.5 3rd Qu.: 7964 3rd Qu.:2154 3rd Qu.: 866.2
## Max. :1115.0 Max. :21757 Max. :4872 Max. :3403.5
## sd(Customers) mean(Sales)/mean(Customers)
## Min. : 52.59 Min. : 3.514
## 1st Qu.:104.56 1st Qu.: 8.132
## Median :134.86 Median : 9.464
## Mean :148.98 Mean : 9.644
## 3rd Qu.:174.96 3rd Qu.:10.981
## Max. :654.32 Max. :16.163
#(Year-Month) average sales, and precent change
avgSales_month <- Train_data %>%
group_by(yearMonth)%>%
summarize(mean(Sales))
avgCustomers_month <- Train_data %>%
group_by(yearMonth)%>%
summarize(mean(Customers))
Promotion v.s sales & customer: 1. Average sales and customers with promotion is more higher than no promotion, especially in December. 2. Sales per customer spend with promotion is slightly higher than without promotion. 3. Sales per customer spend among months are not much significantly difference, but still can observe that having promotion in Dec. can increase more sales and customer.
#total sales between promo and no promo
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo, y=Train_data$Sales))+
geom_bar(stat = "identity")
#total customers between promo and no promo
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo, y=Train_data$Customers))+
geom_bar(stat = "identity")
#(Year-Month) average sales&customer with promo
avgSales_month_promo <- Train_data %>%
filter(Promo==1)%>%
group_by(yearMonth)%>%
summarize(mean(Sales))
avgCustomers_month_promo <- Train_data %>%
filter(Promo==1)%>%
group_by(yearMonth)%>%
summarize(mean(Customers))
#(Year-Month) average sales&customer without promo
avgSales_month_withoutpromo <- Train_data %>%
filter(Promo==0)%>%
group_by(yearMonth)%>%
summarize(mean(Sales))
avgCustomers_month_withoutpromo <- Train_data %>%
filter(Promo==0)%>%
group_by(yearMonth)%>%
summarize(mean(Customers))
#cbind avg sale & avg customer, with promo, without promo
compareAvgSale_month <- cbind(avgSales_month, avgSales_month_promo$`mean(Sales)`, avgSales_month_withoutpromo$`mean(Sales)`)
names(compareAvgSale_month)[3]<-"avgSalesMeanPromo"
names(compareAvgSale_month)[4]<-"avgSalesMeanNoPromo"
compareAvgCustomers_month <- cbind(avgCustomers_month, avgCustomers_month_promo$`mean(Customers)`, avgCustomers_month_withoutpromo$`mean(Customers)`)
names(compareAvgCustomers_month)[3]<-"avgCusMeanPromo"
names(compareAvgCustomers_month)[4]<-"avgCusMeanNoPromo"
#compare average sales by year-month
ggplot(data = compareAvgSale_month, mapping=aes(x=compareAvgSale_month$yearMonth))+
geom_point(mapping=aes(y=compareAvgSale_month$avgSalesMeanPromo), colour="red")+
geom_point(mapping=aes(y=compareAvgSale_month$avgSalesMeanNoPromo), colour="blue")
#compare average customers by year-month
ggplot(data = compareAvgCustomers_month, mapping=aes(x=compareAvgCustomers_month$yearMonth))+
geom_point(mapping=aes(y=compareAvgCustomers_month$avgCusMeanPromo), colour="red")+
geom_point(mapping=aes(y=compareAvgCustomers_month$avgCusMeanNoPromo), colour="blue")
#(byMonth) average sales&customer with promo
avgSales_month2_promo <- Train_data %>%
filter(Open==1)%>%
filter(Promo==1)%>%
group_by(month(Date))%>%
summarize(mean(Sales))
names(avgSales_month2_promo)[2] <- "meanSales_promo"
avgCustomers_month2_promo <- Train_data %>%
filter(Open==1)%>%
filter(Promo==1)%>%
group_by(month(Date))%>%
summarize(mean(Customers))
names(avgCustomers_month2_promo)[2] <- "meanCustomers_promo"
#(byMonth) average sales & customer with no promo
avgSales_month2_noPromo <- Train_data %>%
filter(Open==1)%>%
filter(Promo==0)%>%
group_by(month(Date))%>%
summarize(mean(Sales))
names(avgSales_month2_noPromo)[2] <- "meanSales_nopromo"
avgCustomers_month2_noPromo <- Train_data %>%
filter(Open==1)%>%
filter(Promo==0)%>%
group_by(month(Date))%>%
summarize(mean(Customers))
names(avgCustomers_month2_noPromo)[2] <- "meanCustomers_nopromo"
#cbind
compareByMonth_saleCustomer <- cbind(avgSales_month2_promo,avgCustomers_month2_promo[,-1],avgSales_month2_noPromo[,-1],avgCustomers_month2_noPromo[,-1])
compareByMonth_saleCustomer <- compareByMonth_saleCustomer%>%
mutate(per_promo=compareByMonth_saleCustomer$meanSales_promo/compareByMonth_saleCustomer$meanCustomers_promo,per_noPromo=compareByMonth_saleCustomer$meanSales_nopromo/compareByMonth_saleCustomer$meanCustomers_nopromo)
#compare sales by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanSales_promo),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanSales_nopromo),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average sales")
#compare customer by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanCustomers_promo),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanCustomers_nopromo),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average customers")
#compare per customer spend by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$per_promo),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer$per_noPromo),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average per customer spend")
Store types & Assortment v.s sales&customer: 1.Total sales, customers and sales per customer spend of store type B are lower than other store types, but the average sales, customers and sales per customer spend are higher than other store types, it may because the number of type B store is very few. 2. Total sales, customers and sales per customer spend of assortment B are lower than other assortments, but the average sales, customers and sales per customer spend are higher than other assortments, it may because the number of assortments B is very few. 3. Store types and assortment with promotion seems still a little higher than those without promotion. 4. The store type B with assortment C has the highest average sales and customers.
#store type v.s assortment
ggplot(data = Train_data,mapping = aes(x=Train_data$StoreType,y=Train_data$Assortment))+
geom_point()
#StoreType: average sale, customer and sales per customer spend
avg_type <- Train_data %>%
filter(Open==1)%>%
group_by(StoreType)%>%
summarize(n(),mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))
#store type: total sales: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$StoreType,y=Train_data$Sales))+
geom_bar(stat = "identity")+
xlab("StoreType")+
ylab("Total sales")
#store type: total customers: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$StoreType,y=Train_data$Customers))+
geom_bar(stat = "identity")+
xlab("StoreType")+
ylab("Total customers")
#store type: average sales
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Sales)`))+
geom_bar(stat = "identity")+
xlab("StoreType")+
ylab("Average sales")
#store type: average customers
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Customers)`))+
geom_bar(stat = "identity")+
xlab("StoreType")+
ylab("Average customers")
#store type: sales per customer spend
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Sales)/mean(Customers)`))+
geom_bar(stat = "identity")+
xlab("StoreType")+
ylab("Average sales per customer")
#Assortment: average sale, customer and sales per customer spend
avg_assortment <- Train_data %>%
filter(Open==1)%>%
group_by(Assortment)%>%
summarize(n(),mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))
#Assortment: total sales: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$Assortment,y=Train_data$Sales))+
geom_bar(stat = "identity")+
xlab("Assortment")+
ylab("Total sales")
#Assortment: total customers: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$Assortment,y=Train_data$Customers))+
geom_bar(stat = "identity")+
xlab("Assortment")+
ylab("Total customers")
#Assortment: average sales
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Sales)`))+
geom_bar(stat = "identity")+
xlab("Assortment")+
ylab("Average sales")
#Assortment: average customers
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Customers)`))+
geom_bar(stat = "identity")+
xlab("Assortment")+
ylab("Average customers")
#Assortment: sales per customer spend
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Sales)/mean(Customers)`))+
geom_bar(stat = "identity")+
xlab("Assortment")+
ylab("Average sales per customer")
#store type * assortment
aa<- Train_data%>%
filter(Open=="1"&StoreType=="a"&Assortment=="a")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="aa")
names(aa)[1] <-"Sales"
names(aa)[2] <-"Customers"
ab<- Train_data%>%
filter(Open=="1"&StoreType=="a"&Assortment=="b")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="ab")
names(ab)[1] <-"Sales"
names(ab)[2] <-"Customers"
ac<-Train_data%>%
filter(Open=="1"&StoreType=="a"&Assortment=="c")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="ac")
names(ac)[1] <-"Sales"
names(ac)[2] <-"Customers"
ba<-Train_data%>%
filter(Open=="1"&StoreType=="b"&Assortment=="a")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="ba")
names(ba)[1] <-"Sales"
names(ba)[2] <-"Customers"
bb<-Train_data%>%
filter(Open=="1"&StoreType=="b"&Assortment=="b")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="bb")
names(bb)[1] <-"Sales"
names(bb)[2] <-"Customers"
bc<-Train_data%>%
filter(Open=="1"&StoreType=="b"&Assortment=="c")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="bc")
names(bc)[1] <-"Sales"
names(bc)[2] <-"Customers"
ca<-Train_data%>%
filter(Open=="1"&StoreType=="c"&Assortment=="a")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="ca")
names(ca)[1] <-"Sales"
names(ca)[2] <-"Customers"
cb<-Train_data%>%
filter(Open=="1"&StoreType=="c"&Assortment=="b")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="cb")
names(cb)[1] <-"Sales"
names(cb)[2] <-"Customers"
cc<-Train_data%>%
filter(Open=="1"&StoreType=="c"&Assortment=="c")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="cc")
names(cc)[1] <-"Sales"
names(cc)[2] <-"Customers"
da<-Train_data%>%
filter(Open=="1"&StoreType=="d"&Assortment=="a")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="da")
names(da)[1] <-"Sales"
names(da)[2] <-"Customers"
db<-Train_data%>%
filter(Open=="1"&StoreType=="d"&Assortment=="b")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="db")
names(db)[1] <-"Sales"
names(db)[2] <-"Customers"
dc<-Train_data%>%
filter(Open=="1"&StoreType=="d"&Assortment=="c")%>%
summarize(mean(Sales),mean(Customers))%>%
mutate(index="dc")
names(dc)[1] <-"Sales"
names(dc)[2] <-"Customers"
#store type * assortment: average ales
compareType.Assortment <- rbind(aa,ab,ac,ba,bb,bc,ca,cb,cc,da,db,dc)
ggplotly(
ggplot(data = compareType.Assortment, mapping = aes(x=compareType.Assortment$index,y=compareType.Assortment$Sales))+
geom_bar(stat = "identity")+
xlab("Store type * Assortment")+
ylab("Average sales")
)
## Warning: Removed 3 rows containing missing values (position_stack).
#store type * assortment: average customers
compareType.Assortment <- rbind(aa,ab,ac,ba,bb,bc,ca,cb,cc,da,db,dc)
ggplotly(
ggplot(data = compareType.Assortment, mapping = aes(x=compareType.Assortment$index,y=compareType.Assortment$Customers))+
geom_bar(stat = "identity")+
xlab("Store type * Assortment")+
ylab("Average customers")
)
## Warning: Removed 3 rows containing missing values (position_stack).
DayOfWeek v.s Sales& Customers: 1. The slaes and customers on Sunday are veey low because few stores are opened and storeType C not open on Sunday. 2. While Sunday has the lowest sales and customer, its average sales and customer are the highest. It may because there’s only few stores opened. Its sales per customer spend is also really high even there’s no promotion on Sunday. 3. It seems like Monday has the highest total sales and customer, and its average sales and customer are also very high. Sales per customer spend is the highest on Monday when stores have promotion. 4. Promotion definitely affect sales and customers, surprisingly Saturday and Sunday don’t have promotion.
#Day of week : total sales : store type
ggplot(data = Train_data, mapping=aes(x=Train_data$DayOfWeek, y=Train_data$Sales))+
geom_bar(stat = "identity", aes(color=Train_data$StoreType))+
xlab("Day of week")+
ylab("Average sales")
#Day of week : total customers : store type
ggplot(data = Train_data, mapping=aes(x=Train_data$DayOfWeek, y=Train_data$Customers))+
geom_bar(stat = "identity",aes(color=Train_data$StoreType))+
xlab("Day of week")+
ylab("Average customers")
#the opened stores' average sales and customers each day of week
dayofweek_open <- Train_data%>%
filter(Open==1)%>%
group_by(DayOfWeek)%>%
summarize(n(),mean(Sales),mean(Customers))
ggplot(data = dayofweek_open, mapping = aes(x=dayofweek_open$DayOfWeek))+
geom_bar(stat = "identity", mapping = aes(y=dayofweek_open$`mean(Sales)`),colour="green")+
geom_bar(stat = "identity", mapping = aes(y=dayofweek_open$`mean(Customers)`),colour="yellow")
#opened stores with promotion or not : average sales and customers
dayofweek_open_promo <- Train_data%>%
filter(Open==1 & Promo==1)%>%
group_by(DayOfWeek)%>%
summarize(n(),mean(Sales),mean(Customers))
dayofweek_open_promo[nrow(dayofweek_open_promo)+1,]=list("6",0,0,0)
dayofweek_open_promo[nrow(dayofweek_open_promo)+1,]=list("7",0,0,0)
dayofweek_open_noPromo <- Train_data%>%
filter(Open==1 & Promo==0)%>%
group_by(DayOfWeek)%>%
summarize(n(),mean(Sales),mean(Customers))
#cbind
compareDayofweek_open <- cbind(dayofweek_open_promo[,-2],dayofweek_open_noPromo[,c(-1,-2)])
names(compareDayofweek_open)[c(2,3,4,5)] <- c("promo_meanSales","promo_meanCustomers","noPromo_meanSales","noPromo_meanSCustomers")
#compare average sales with or without promotion
ggplot(data = compareDayofweek_open, mapping = aes(x=compareDayofweek_open$DayOfWeek))+
geom_point(mapping = aes(y=compareDayofweek_open$promo_meanSales),colour="red")+
geom_point(mapping = aes(y=compareDayofweek_open$noPromo_meanSales),colour="blue")+
xlab("Day of week")+
ylab("Average sales")
#compare average sales per customer spend with or without promotion
ggplot(data = compareDayofweek_open, mapping = aes(x=compareDayofweek_open$DayOfWeek))+
geom_point(mapping = aes(y=compareDayofweek_open$promo_meanSales/compareDayofweek_open$promo_meanCustomers),colour="red")+
geom_point(mapping = aes(y=compareDayofweek_open$noPromo_meanSales/compareDayofweek_open$noPromo_meanSCustomers),colour="blue")
## Warning: Removed 2 rows containing missing values (geom_point).
stateHoliday: 1. No promotion running on any stores on Christmas (c), only store type B and D opened on Christmas and store type C not opened on Easter day 2. Store type B has the significant total sales on each holiday.
#opened stores on state holiday
stateHoliday <- Train_data %>%
filter(Open==1)%>%
filter(StateHoliday==c("a","b","c"))
#state holiday : total sales : promotion
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Promo),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
geom_bar(stat = "identity",position = "dodge")+
xlab("State holiday")+
ylab("Total sales")
#state holiday : total sales : store type
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$StoreType),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
geom_bar(stat = "identity",position = "dodge")+
xlab("State holiday")+
ylab("Total sales")
#state holiday : total sales : assortment
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Assortment),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
geom_bar(stat = "identity",position = "dodge")+
xlab("State holiday")+
ylab("Total sales")
#state holiday : total customers : promotion
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Promo),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
geom_bar(stat = "identity",position = "dodge")
#state holiday : total customers : store type
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$StoreType),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
geom_bar(stat = "identity",position = "dodge")
#state holiday : total customers : assortment
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Assortment),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
geom_bar(stat = "identity",position = "dodge")
schoolHoliday: 1. It seems like there is not a specific store type or assortment belonged to school. 2. Also, the promotion running or not do not affected by whether it is a school store or not.
Train_data%>%
filter(Open==1&Promo==1)%>%
group_by(SchoolHoliday)%>%
summarize(n(),mean(Sales),mean(Customers),mean(Sales)/mean(Customers))
## # A tibble: 2 x 5
## SchoolHoliday `n()` `mean(Sales)` `mean(Customers)` `mean(Sales)/mean(~
## <int> <int> <dbl> <dbl> <dbl>
## 1 0 299133 8199. 843. 9.72
## 2 1 77763 8341. 849. 9.82
Train_data%>%
filter(Open==1&Promo==0)%>%
group_by(SchoolHoliday)%>%
summarize(n(),mean(Sales),mean(Customers),mean(Sales)/mean(Customers))
## # A tibble: 2 x 5
## SchoolHoliday `n()` `mean(Sales)` `mean(Customers)` `mean(Sales)/mean(~
## <int> <int> <dbl> <dbl> <dbl>
## 1 0 381802 5876. 691. 8.50
## 2 1 85694 6165. 723. 8.53
#Store type and assortment affected by school holiday
Train_data%>%
filter(SchoolHoliday==1)%>%
distinct(Assortment)
## Assortment
## 1 a
## 2 c
## 3 b
Train_data%>%
filter(SchoolHoliday==1)%>%
distinct(StoreType)
## StoreType
## 1 c
## 2 a
## 3 d
## 4 b
#school holiday : total sales : promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$SchoolHoliday, y=Train_data$Sales))+
geom_bar(stat = "identity",position = "dodge")+
xlab("School holiday")+
ylab("Total sales")
Promo2: Stores which keep participating promotion earn less average sales and customers than those not participate continuing promotion, but its sales per customer spend in store is higher.
#promo2 : total sales
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo2, y=Train_data$Sales))+
geom_bar(stat = "identity")
#promo2 : total customers
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo2, y=Train_data$Customers))+
geom_bar(stat = "identity")
#(byMonth) opened stores' average sales and customer with promo2
avgSales_month_promo2 <- Train_data %>%
filter(Open==1)%>%
filter(Promo2==1)%>%
group_by(month(Date))%>%
summarize(mean(Sales))
names(avgSales_month_promo2)[2] <- "meanSales_promo2"
avgCustomers_month_promo2 <- Train_data %>%
filter(Open==1)%>%
filter(Promo2==1)%>%
group_by(month(Date))%>%
summarize(mean(Customers))
names(avgCustomers_month_promo2)[2] <- "meanCustomers_promo2"
#(byMonth) average sales and customer with no promo2
avgSales_month_noPromo2 <- Train_data %>%
filter(Open==1)%>%
filter(Promo2==0)%>%
group_by(month(Date))%>%
summarize(mean(Sales))
names(avgSales_month_noPromo2)[2] <- "meanSales_nopromo2"
avgCustomers_month_noPromo2 <- Train_data %>%
filter(Open==1)%>%
filter(Promo2==0)%>%
group_by(month(Date))%>%
summarize(mean(Customers))
names(avgCustomers_month_noPromo2)[2] <- "meanCustomers_nopromo2"
#cbind
compareByMonth_saleCustomer2 <- cbind(avgSales_month_promo2,avgCustomers_month_promo2[,-1],avgSales_month_noPromo2[,-1],avgCustomers_month_noPromo2[,-1])
compareByMonth_saleCustomer2 <- compareByMonth_saleCustomer2%>%
mutate(per_promo2=compareByMonth_saleCustomer2$meanSales_promo2/compareByMonth_saleCustomer2$meanCustomers_promo2,per_noPromo2=compareByMonth_saleCustomer2$meanSales_nopromo2/compareByMonth_saleCustomer2$meanCustomers_nopromo2)
#compare average sales by month with promo2
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanSales_promo2),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanSales_nopromo2),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average sales")
#compare average customers by month
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanCustomers_promo2),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanCustomers_nopromo2),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average customers")
#compare per customer spend by month
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$per_promo2),colour="red")+
geom_line(mapping = aes(y=compareByMonth_saleCustomer2$per_noPromo2),colour="blue")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("Average per customer spend")
promotion vs continuing promotion: The stores’ average sales with promotion as well as continuing promotion are lower than those do not participate continuing promotion. It may because the longer period drag down the average. But the average per customer spend with promotion as well as continuing promotion is still higher than others.
#the average sales and customers with all conbinations of promo and promo2
Promo11 <- Train_data%>%
filter(Open==1 & Promo==1 & Promo2==1)%>%
group_by(month)%>%
summarize(mean(Sales),mean(Customers))
names(Promo11)[2] <- "promo11meanSales"
names(Promo11)[3] <- "promo11meanCustomers"
Promo01 <- Train_data%>%
filter(Open==1 & Promo==0 & Promo2==1)%>%
group_by(month)%>%
summarize(mean(Sales),mean(Customers))
names(Promo01)[2] <- "promo01meanSales"
names(Promo01)[3] <- "promo01meanCustomers"
Promo10 <- Train_data%>%
filter(Open==1 & Promo==1 & Promo2==0)%>%
group_by(month)%>%
summarize(mean(Sales),mean(Customers))
names(Promo10)[2] <- "promo10meanSales"
names(Promo10)[3] <- "promo10meanCustomers"
Promo00 <- Train_data%>%
filter(Open==1 & Promo==0 & Promo2==0)%>%
group_by(month)%>%
summarize(mean(Sales),mean(Customers))
names(Promo00)[2] <- "promo00meanSales"
names(Promo00)[3] <- "promo00meanCustomers"
#cbind
comparePromo <- cbind(Promo00,Promo01[,-1],Promo10[,-1],Promo11[,-1])
#average sales
ggplot(data = comparePromo, mapping = aes(x=comparePromo$month))+
geom_line(mapping = aes(y=comparePromo$promo00meanSales))+
geom_line(mapping = aes(y=comparePromo$promo01meanSales),colour="orange")+
geom_line(mapping = aes(y=comparePromo$promo10meanSales),colour="blue")+
geom_line(mapping = aes(y=comparePromo$promo11meanSales),colour="red")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("average sales")
#average sales per customer spend
ggplot(data = comparePromo, mapping = aes(x=comparePromo$month))+
geom_line(mapping = aes(y=comparePromo$promo00meanSales/comparePromo$promo00meanCustomers))+
geom_line(mapping = aes(y=comparePromo$promo01meanSales/comparePromo$promo01meanCustomers),colour="orange")+
geom_line(mapping = aes(y=comparePromo$promo10meanSales/comparePromo$promo10meanCustomers),colour="blue")+
geom_line(mapping = aes(y=comparePromo$promo11meanSales/comparePromo$promo11meanCustomers),colour="red")+
scale_x_continuous(breaks = 1:12)+
xlab("Month")+
ylab("sales per customer spend")
predict sales Assume customer numbers are average of that month (using history data)
Train_data <- Train_data%>%
mutate(dayOfWeekDummy=ifelse(DayOfWeek==7,0,1))
Train_train <- Train_data%>%
filter(Open=="1")
rs_test <- sample(nrow(Train_train),nrow(Train_train)-800000,replace = FALSE)
Train_test <- Train_train[rs_test,]
Train_train2 <- Train_train[-(rs_test),]
#test data
Test_data <- left_join(Test, Store, by="Store")
Test_data <- Test_data %>%
mutate(date=as.Date(Test_data$Date, format="%Y-%m-%d"))%>%
mutate(yearMonth=format(as.Date(date),"%Y-%m"))%>%
mutate(month=month(Date))
Test_data <- Test_data%>%
mutate(dayOfWeekDummy=ifelse(DayOfWeek==7,0,1))
Test_data <- Test_data%>%
filter(Open=="1")
#get average customers in August and September
Customers89 <- Train_data%>%
filter(month==c(8,9))%>%
group_by(Store)%>%
summarize(mean(Customers))
## Warning in month == c(8, 9): longer object length is not a multiple of
## shorter object length
names(Customers89)[2]<-"Customers"
Test_data <- left_join(Test_data, Customers89, by="Store")
############################################################################
#model
lm_Sales1 <- lm(Sales~dayOfWeekDummy+as.factor(month)+Promo+StateHoliday+SchoolHoliday+StoreType+Assortment+Promo2+CompetitionDistance, Train_train)
summary(lm_Sales1)
##
## Call:
## lm(formula = Sales ~ dayOfWeekDummy + as.factor(month) + Promo +
## StateHoliday + SchoolHoliday + StoreType + Assortment + Promo2 +
## CompetitionDistance, data = Train_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10519 -1771 -409 1264 35365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.892e+03 4.936e+01 119.361 < 2e-16 ***
## dayOfWeekDummy -2.477e+02 4.816e+01 -5.144 2.69e-07 ***
## as.factor(month)2 1.038e+02 1.346e+01 7.708 1.28e-14 ***
## as.factor(month)3 3.428e+02 1.323e+01 25.917 < 2e-16 ***
## as.factor(month)4 3.999e+02 1.343e+01 29.768 < 2e-16 ***
## as.factor(month)5 5.338e+02 1.347e+01 39.624 < 2e-16 ***
## as.factor(month)6 4.868e+02 1.337e+01 36.417 < 2e-16 ***
## as.factor(month)7 1.870e+02 1.351e+01 13.848 < 2e-16 ***
## as.factor(month)8 -7.524e+01 1.560e+01 -4.824 1.41e-06 ***
## as.factor(month)9 -3.299e+00 1.521e+01 -0.217 0.8283
## as.factor(month)10 -6.857e+00 1.521e+01 -0.451 0.6522
## as.factor(month)11 5.174e+02 1.533e+01 33.763 < 2e-16 ***
## as.factor(month)12 2.057e+03 1.542e+01 133.443 < 2e-16 ***
## Promo 2.317e+03 6.029e+00 384.300 < 2e-16 ***
## StateHolidaya 2.449e+01 1.048e+02 0.234 0.8153
## StateHolidayb 5.764e+02 2.284e+02 2.524 0.0116 *
## StateHolidayc -1.337e+03 3.263e+02 -4.098 4.16e-05 ***
## SchoolHoliday 2.994e+02 8.357e+00 35.830 < 2e-16 ***
## StoreTypeb 5.039e+03 3.322e+01 151.697 < 2e-16 ***
## StoreTypec -1.124e+02 9.164e+00 -12.265 < 2e-16 ***
## StoreTyped -2.083e+02 7.013e+00 -29.696 < 2e-16 ***
## Assortmentb -3.022e+03 4.406e+01 -68.588 < 2e-16 ***
## Assortmentc 8.533e+02 6.261e+00 136.285 < 2e-16 ***
## Promo2 -7.581e+02 6.093e+00 -124.415 < 2e-16 ***
## CompetitionDistance -2.376e-02 3.968e-04 -59.883 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2741 on 842181 degrees of freedom
## (2186 observations deleted due to missingness)
## Multiple R-squared: 0.2207, Adjusted R-squared: 0.2206
## F-statistic: 9936 on 24 and 842181 DF, p-value: < 2.2e-16
lm_Sales2 <- lm(Sales~DayOfWeek+Promo+SchoolHoliday+StoreType+Assortment+Promo2+CompetitionDistance, Train_train2)
summary(lm_Sales2)
##
## Call:
## lm(formula = Sales ~ DayOfWeek + Promo + SchoolHoliday + StoreType +
## Assortment + Promo2 + CompetitionDistance, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10309 -1818 -449 1263 35031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.616e+03 1.035e+01 639.04 <2e-16 ***
## DayOfWeek -1.480e+02 1.899e+00 -77.93 <2e-16 ***
## Promo 2.164e+03 6.519e+00 332.01 <2e-16 ***
## SchoolHoliday 1.264e+02 7.929e+00 15.94 <2e-16 ***
## StoreTypeb 5.148e+03 3.365e+01 153.01 <2e-16 ***
## StoreTypec -1.117e+02 9.522e+00 -11.73 <2e-16 ***
## StoreTyped -2.113e+02 7.283e+00 -29.02 <2e-16 ***
## Assortmentb -3.022e+03 4.575e+01 -66.06 <2e-16 ***
## Assortmentc 8.573e+02 6.502e+00 131.86 <2e-16 ***
## Promo2 -7.656e+02 6.327e+00 -121.01 <2e-16 ***
## CompetitionDistance -2.349e-02 4.110e-04 -57.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2771 on 797920 degrees of freedom
## (2069 observations deleted due to missingness)
## Multiple R-squared: 0.2033, Adjusted R-squared: 0.2033
## F-statistic: 2.036e+04 on 10 and 797920 DF, p-value: < 2.2e-16
#assuming the number of customers is the average of the previous years
lm_Sales3 <- lm(Sales~Customers+Promo+StoreType, Train_train2)
summary(lm_Sales3)
##
## Call:
## lm(formula = Sales ~ Customers + Promo + StoreType, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28443.1 -814.4 -135.4 660.9 28414.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.937e+02 4.084e+00 145.37 <2e-16 ***
## Customers 7.288e+00 4.456e-03 1635.43 <2e-16 ***
## Promo 1.194e+03 3.149e+00 379.33 <2e-16 ***
## StoreTypeb -5.555e+03 1.270e+01 -437.45 <2e-16 ***
## StoreTypec -1.412e+02 4.666e+00 -30.26 <2e-16 ***
## StoreTyped 1.275e+03 3.554e+00 358.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1366 on 799994 degrees of freedom
## Multiple R-squared: 0.8063, Adjusted R-squared: 0.8063
## F-statistic: 6.661e+05 on 5 and 799994 DF, p-value: < 2.2e-16
ggplot(lm_Sales3, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lm_Sales3,aes(sample=.stdresid))+stat_qq()+geom_abline()
#best
lm_Sales3.1 <- lm(Sales~Customers*StoreType+Promo, Train_train2)
summary(lm_Sales3.1)
##
## Call:
## lm(formula = Sales ~ Customers * StoreType + Promo, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28513.5 -771.4 -89.8 662.4 28353.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.226e+02 4.528e+00 137.50 <2e-16 ***
## Customers 7.307e+00 5.149e-03 1419.14 <2e-16 ***
## StoreTypeb -6.487e+02 2.631e+01 -24.66 <2e-16 ***
## StoreTypec -3.063e+02 1.145e+01 -26.74 <2e-16 ***
## StoreTyped -5.049e+02 9.506e+00 -53.11 <2e-16 ***
## Promo 1.096e+03 2.998e+00 365.71 <2e-16 ***
## Customers:StoreTypeb -2.441e+00 1.276e-02 -191.29 <2e-16 ***
## Customers:StoreTypec 2.022e-01 1.301e-02 15.54 <2e-16 ***
## Customers:StoreTyped 2.942e+00 1.410e-02 208.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1293 on 799991 degrees of freedom
## Multiple R-squared: 0.8265, Adjusted R-squared: 0.8265
## F-statistic: 4.764e+05 on 8 and 799991 DF, p-value: < 2.2e-16
ggplot(lm_Sales3.1, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lm_Sales3.1,aes(sample=.stdresid))+stat_qq()+geom_abline()
lm_Sales4 <- lm(Sales~Customers+Promo+CompetitionDistance, Train_train2)
summary(lm_Sales4)
##
## Call:
## lm(formula = Sales ~ Customers + Promo + CompetitionDistance,
## data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21087.3 -912.7 -148.1 790.8 29449.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.468e+03 4.352e+00 337.3 <2e-16 ***
## Customers 6.147e+00 4.598e-03 1336.8 <2e-16 ***
## Promo 1.393e+03 3.672e+00 379.3 <2e-16 ***
## CompetitionDistance 3.258e-02 2.323e-04 140.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1602 on 797927 degrees of freedom
## (2069 observations deleted due to missingness)
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7336
## F-statistic: 7.324e+05 on 3 and 797927 DF, p-value: < 2.2e-16
ggplot(lm_Sales4, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lm_Sales5 <- lm(Sales~Customers+Promo+Assortment, Train_train2)
summary(lm_Sales5)
##
## Call:
## lm(formula = Sales ~ Customers + Promo + Assortment, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24095.4 -896.2 -152.7 744.9 28421.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.088e+03 3.992e+00 272.5 <2e-16 ***
## Customers 6.618e+00 4.361e-03 1517.7 <2e-16 ***
## Promo 1.304e+03 3.332e+00 391.3 <2e-16 ***
## Assortmentb -6.618e+03 1.759e+01 -376.3 <2e-16 ***
## Assortmentc 6.521e+02 3.270e+00 199.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1452 on 799995 degrees of freedom
## Multiple R-squared: 0.7811, Adjusted R-squared: 0.7811
## F-statistic: 7.138e+05 on 4 and 799995 DF, p-value: < 2.2e-16
ggplot(lm_Sales5, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lm_Sales6 <- lm(Sales~Customers+Promo+Promo2, Train_train2)
summary(lm_Sales6)
##
## Call:
## lm(formula = Sales ~ Customers + Promo + Promo2, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20891.3 -941.9 -155.1 802.6 29268.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.575e+03 4.737e+00 332.51 <2e-16 ***
## Customers 6.106e+00 4.686e-03 1303.05 <2e-16 ***
## Promo 1.397e+03 3.704e+00 377.22 <2e-16 ***
## Promo2 1.992e+02 3.697e+00 53.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1618 on 799996 degrees of freedom
## Multiple R-squared: 0.7284, Adjusted R-squared: 0.7284
## F-statistic: 7.152e+05 on 3 and 799996 DF, p-value: < 2.2e-16
ggplot(lm_Sales6, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lm_Sales7 <- lm(Sales~Customers+Promo+SchoolHoliday, Train_train2)
summary(lm_Sales7)
##
## Call:
## lm(formula = Sales ~ Customers + Promo + SchoolHoliday, data = Train_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20617.4 -948.1 -156.7 807.2 29441.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.694e+03 4.095e+00 413.7 <2e-16 ***
## Customers 6.052e+00 4.593e-03 1317.6 <2e-16 ***
## Promo 1.403e+03 3.708e+00 378.4 <2e-16 ***
## SchoolHoliday 9.816e+01 4.587e+00 21.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1620 on 799996 degrees of freedom
## Multiple R-squared: 0.7276, Adjusted R-squared: 0.7276
## F-statistic: 7.122e+05 on 3 and 799996 DF, p-value: < 2.2e-16
ggplot(lm_Sales7, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#prediction using model 3.1
salesPred1 <- predict(lm_Sales1,newdata = Test_data,interval = "prediction")
summary(salesPred1)
## fit lwr upr
## Min. : 4014 Min. :-1358 Min. : 9386
## 1st Qu.: 5519 1st Qu.: 147 1st Qu.:10890
## Median : 6474 Median : 1102 Median :11845
## Mean : 6645 Mean : 1273 Mean :12016
## 3rd Qu.: 7880 3rd Qu.: 2509 3rd Qu.:13252
## Max. :14121 Max. : 8749 Max. :19493
## NA's :65 NA's :65 NA's :65
plot(salesPred1)
Train_test$predSales3.1 <- predict(lm_Sales3.1, newdata= Train_test)
Test_data$predSales <- predict(lm_Sales3.1, newdata= Test_data)
ggplot(data = Test_data, mapping = aes(x=Test_data$Store,y=Test_data$predSales))+
geom_bar(stat = "identity")